# ===============================================
# RX 480 / GPU Full Saturation Ceiling Finder
# ===============================================
import pyopencl as cl
import numpy as np
import time
import math

# ---------------------------
# Auto-detect GPU
# ---------------------------
# Pick the first GPU-class device reported by any OpenCL platform, scanning
# platforms and their devices in enumeration order and stopping at the first hit.
device = next(
    (
        candidate
        for cl_platform in cl.get_platforms()
        for candidate in cl_platform.get_devices()
        if candidate.type & cl.device_type.GPU
    ),
    None,
)

# Without a GPU there is nothing to benchmark, so fail loudly up front.
if device is None:
    raise RuntimeError("No GPU devices found via OpenCL")

# Everything below runs in a context and command queue bound to that one device.
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)

# Report the device properties; vram_bytes is also consumed by the sizing code.
print("Using device:", device.name)
vram_bytes = device.global_mem_size
print("VRAM (GB):", vram_bytes / 1024**3)
print("Compute Units:", device.max_compute_units)
print("Max Clock (MHz):", device.max_clock_frequency)

# ---------------------------
# Recursive expansion model
# ---------------------------
def expansion(depth):
    """Return the expansion factor for recursion level *depth*.

    The model assumes an eightfold fan-out per level, so the result is
    8 raised to the power ``depth`` (``expansion(0) == 1``).
    """
    return pow(8, depth)

# Sizing model: budget 64 bytes of device memory per seed.
# NOTE(review): the benchmark buffer created later holds one float32 (4 bytes)
# per seed, so this 64-byte figure is a model assumption rather than the size
# actually allocated - confirm the intended per-seed footprint.
seed_size = 64
# Largest number of whole seeds that fit in the device's reported global memory.
max_seeds = vram_bytes // seed_size
print("Max seeds based on VRAM:", max_seeds)

# ---------------------------
# OpenCL Kernel
# ---------------------------
# OpenCL C source for the benchmark kernel `recurse`.
#
# Each work-item reads data[get_global_id(0)], applies
#     x = sqrt(x * 1.618f + 0.5f) * 1.0001f
# `expansion` times in a row, and writes the result back to the same slot.
# Every iteration consumes the previous x, so the loop is a serial dependency
# chain within a work-item; the work per element is set by `expansion`.
#
# NOTE(review): the kernel does not bounds-check gid, so the global size used
# at launch must not exceed the number of floats in `data`.
# NOTE(review): `expansion` is a signed 32-bit int on the device side; callers
# must keep the iteration count within that range.
kernel_code = """
__kernel void recurse(
    __global float *data,
    const int expansion)
{
    int gid = get_global_id(0);
    float x = data[gid];
    for(int i=0; i<expansion; i++){
        x = sqrt(x * 1.618f + 0.5f) * 1.0001f;
    }
    data[gid] = x;
}
"""
# Build the source in the context created above; the kernel is invoked later
# through the resulting program object as `program.recurse`.
program = cl.Program(ctx, kernel_code).build()

# ---------------------------
# Ceiling Finder Loop
# ---------------------------
# Sweep configuration. The seed count N is chosen once and stays fixed; each
# pass raises the recursion depth, which multiplies the per-seed iteration
# count (see expansion()) until the device can no longer finish a launch in
# under one second.
depth = 1
N = int(min(max_seeds, 2**24))  # start with 16M seeds if VRAM allows

# The kernel receives its iteration count as a 32-bit signed int, so this is
# the largest expansion factor that can be handed over intact.
INT32_MAX = int(np.iinfo(np.int32).max)
# Number of back-to-back launches averaged for each timing sample.
TIMED_RUNS = 3

while True:
    expansion_factor = expansion(depth)

    # Stop before np.int32() is asked to hold a value it cannot represent.
    # Depending on the NumPy version that conversion either raises or wraps;
    # a wrapped count would make the kernel do far less work than reported
    # and the sweep would never reach its FPS-based stop condition.
    if expansion_factor > INT32_MAX:
        print(f"\n[Stopping] Expansion={expansion_factor:,} exceeds the kernel's 32-bit loop counter")
        print("\n[Ceiling reached]")
        break

    print(f"\n[Testing] Depth={depth}, N={N:,}, Expansion={expansion_factor:,}")
    iterations = np.int32(expansion_factor)

    # Allocate buffer: fresh random float32 input, copied to the device at
    # buffer creation (COPY_HOST_PTR).
    data = np.random.rand(N).astype(np.float32)
    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)

    try:
        # Warmup: one untimed launch so one-off startup costs stay out of the
        # measurement.
        evt = program.recurse(queue, (N,), None, buf, iterations)
        evt.wait()

        # Timed run. perf_counter() is a monotonic, high-resolution clock, so
        # the interval is immune to wall-clock adjustments. Only the last
        # event is waited on: the queue is created with default properties,
        # which OpenCL defines as in-order execution, so the final launch
        # completing implies the earlier ones have completed too.
        t0 = time.perf_counter()
        for _ in range(TIMED_RUNS):
            evt = program.recurse(queue, (N,), None, buf, iterations)
        evt.wait()
        dt = (time.perf_counter() - t0) / TIMED_RUNS
    finally:
        # Free the device allocation now instead of waiting for `buf` to be
        # rebound on the next pass, so two buffers are never live at once.
        buf.release()

    fps = 1.0 / dt
    vram_used = data.nbytes / 1024**2
    # NOTE: this counts one operation per kernel loop iteration. The kernel
    # body performs several float operations per iteration, so the figure is
    # "billions of iterations per second" rather than a true FLOP rate.
    flops = (N * expansion_factor) / dt / 1e9

    print(f"Depth {depth} | N={N:,} | VRAM={vram_used:.1f} MB | {fps:.2f} FPS | {flops:.2f} GFLOPs")

    # Stop if FPS drops below 1 or VRAM overflow.
    # N is never changed inside this loop, so data.nbytes is identical on
    # every pass; in practice the FPS test is what ends the sweep.
    if fps < 1 or data.nbytes > vram_bytes:
        print("\n[Ceiling reached]")
        break

    depth += 1
